library(tidyverse)
library(ggplot2)
::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) knitr
Challenge 8 Solutions
Read in data:
I will be using the ‘birds’ data set and the ‘FAOstat Regions’ data set.
*Please note that I added ‘print(head())’ to each data frame to shorten the output, but the codes are not rendering.*
# read in data sets:
<- read_csv("_data/birds.csv")
birds select(birds, -contains("Code"))
# A tibble: 30,977 × 9
Domain Area Element Item Year Unit Value Flag Flag …¹
<chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr>
1 Live Animals Afghanistan Stocks Chickens 1961 1000 Head 4700 F FAO es…
2 Live Animals Afghanistan Stocks Chickens 1962 1000 Head 4900 F FAO es…
3 Live Animals Afghanistan Stocks Chickens 1963 1000 Head 5000 F FAO es…
4 Live Animals Afghanistan Stocks Chickens 1964 1000 Head 5300 F FAO es…
5 Live Animals Afghanistan Stocks Chickens 1965 1000 Head 5500 F FAO es…
6 Live Animals Afghanistan Stocks Chickens 1966 1000 Head 5800 F FAO es…
7 Live Animals Afghanistan Stocks Chickens 1967 1000 Head 6600 F FAO es…
8 Live Animals Afghanistan Stocks Chickens 1968 1000 Head 6290 <NA> Offici…
9 Live Animals Afghanistan Stocks Chickens 1969 1000 Head 6300 F FAO es…
10 Live Animals Afghanistan Stocks Chickens 1970 1000 Head 6000 F FAO es…
# … with 30,967 more rows, and abbreviated variable name ¹`Flag Description`
unique(birds$Item)
[1] "Chickens" "Ducks" "Geese and guinea fowls"
[4] "Turkeys" "Pigeons, other birds"
unique(birds$Flag)
[1] "F" NA "Im" "M" "*" "A"
<- birds%>%
birds_aggregated filter(Flag=="A")
<- birds%>%
birds_split filter(Flag!="A")
unique(birds_split$Flag)
[1] "F" "Im" "M" "*"
print(head(birds_split))
# A tibble: 6 × 14
Domai…¹ Domain Area …² Area Eleme…³ Element Item …⁴ Item Year …⁵ Year Unit
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <chr>
1 QA Live … 2 Afgh… 5112 Stocks 1057 Chic… 1961 1961 1000…
2 QA Live … 2 Afgh… 5112 Stocks 1057 Chic… 1962 1962 1000…
3 QA Live … 2 Afgh… 5112 Stocks 1057 Chic… 1963 1963 1000…
4 QA Live … 2 Afgh… 5112 Stocks 1057 Chic… 1964 1964 1000…
5 QA Live … 2 Afgh… 5112 Stocks 1057 Chic… 1965 1965 1000…
6 QA Live … 2 Afgh… 5112 Stocks 1057 Chic… 1966 1966 1000…
# … with 3 more variables: Value <dbl>, Flag <chr>, `Flag Description` <chr>,
# and abbreviated variable names ¹`Domain Code`, ²`Area Code`,
# ³`Element Code`, ⁴`Item Code`, ⁵`Year Code`
print(head(birds_aggregated))
# A tibble: 6 × 14
Domai…¹ Domain Area …² Area Eleme…³ Element Item …⁴ Item Year …⁵ Year Unit
<chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <chr>
1 QA Live … 5000 World 5112 Stocks 1057 Chic… 1961 1961 1000…
2 QA Live … 5000 World 5112 Stocks 1057 Chic… 1962 1962 1000…
3 QA Live … 5000 World 5112 Stocks 1057 Chic… 1963 1963 1000…
4 QA Live … 5000 World 5112 Stocks 1057 Chic… 1964 1964 1000…
5 QA Live … 5000 World 5112 Stocks 1057 Chic… 1965 1965 1000…
6 QA Live … 5000 World 5112 Stocks 1057 Chic… 1966 1966 1000…
# … with 3 more variables: Value <dbl>, Flag <chr>, `Flag Description` <chr>,
# and abbreviated variable names ¹`Domain Code`, ²`Area Code`,
# ³`Element Code`, ⁴`Item Code`, ⁵`Year Code`
unique(birds_aggregated$Area)
[1] "World" "Africa"
[3] "Eastern Africa" "Middle Africa"
[5] "Northern Africa" "Southern Africa"
[7] "Western Africa" "Americas"
[9] "Northern America" "Central America"
[11] "Caribbean" "South America"
[13] "Asia" "Central Asia"
[15] "Eastern Asia" "Southern Asia"
[17] "South-eastern Asia" "Western Asia"
[19] "Europe" "Eastern Europe"
[21] "Northern Europe" "Southern Europe"
[23] "Western Europe" "Oceania"
[25] "Australia and New Zealand" "Melanesia"
[27] "Micronesia" "Polynesia"
<- read_csv("_data\\FAOSTAT_country_groups.csv")
FAO_region <- FAO_region%>%
FAO_region_country select(`Country Group`, `Country`)%>%
rename(country_group = `Country Group`)%>%
distinct()
head(FAO_region_country)
# A tibble: 6 × 2
country_group Country
<chr> <chr>
1 Africa Algeria
2 Africa Angola
3 Africa Benin
4 Africa Botswana
5 Africa Burkina Faso
6 Africa Burundi
unique(FAO_region_country$country_group)
[1] "Africa"
[2] "Americas"
[3] "Annex I countries"
[4] "Antarctic Region"
[5] "Asia"
[6] "Australia and New Zealand"
[7] "Caribbean"
[8] "Caucasus and Central Asia"
[9] "Central America"
[10] "Central Asia"
[11] "Central Asia and Southern Asia"
[12] "Eastern Africa"
[13] "Eastern Asia"
[14] "Eastern Asia (excluding China)"
[15] "Eastern Asia (excluding Japan and China)"
[16] "Eastern Asia (excluding Japan)"
[17] "Eastern Asia and South-eastern Asia"
[18] "Eastern Europe"
[19] "Europe"
[20] "European Union (27)"
[21] "High-income economies"
[22] "Land Locked Developing Countries"
[23] "Latin America and the Caribbean"
[24] "Least Developed Countries"
[25] "Low income economies"
[26] "Low Income Food Deficit Countries"
[27] "Lower-middle-income economies"
[28] "Melanesia"
[29] "Micronesia"
[30] "Middle Africa"
[31] "Net Food Importing Developing Countries"
[32] "Non-Annex I countries"
[33] "North and Central America"
[34] "Northern Africa"
[35] "Northern Africa (excluding Sudan)"
[36] "Northern America"
[37] "Northern America and Europe"
[38] "Northern Europe"
[39] "Oceania"
[40] "Oceania excluding Australia and New Zealand"
[41] "OECD"
[42] "Polynesia"
[43] "Small Island Developing States"
[44] "South America"
[45] "South-eastern Asia"
[46] "Southern Africa"
[47] "Southern Asia"
[48] "Southern Asia (excluding India)"
[49] "Southern Europe"
[50] "Sub-Saharan Africa"
[51] "Sub-Saharan Africa (including Sudan)"
[52] "Upper-middle-income economies"
[53] "Western Africa"
[54] "Western Asia"
[55] "Western Asia (exc. Armenia, Azerbaijan, Cyprus, Israel and Georgia)"
[56] "Western Asia and Northern Africa"
[57] "Western Europe"
[58] "World"
# test:
%>%
FAO_region_countryfilter(country_group == "Africa")
# A tibble: 63 × 2
country_group Country
<chr> <chr>
1 Africa Algeria
2 Africa Angola
3 Africa Benin
4 Africa Botswana
5 Africa Burkina Faso
6 Africa Burundi
7 Africa Cabo Verde
8 Africa Cameroon
9 Africa Central African Republic
10 Africa Chad
# … with 53 more rows
Briefly describe the data
The ‘birds’ data set contains information related to stocks of different types of poultry: chickens, ducks, geese and guinea fowls, turkeys, and pigeons or other birds. After filtering out the aggregate data by area (my new data set now has 14 columns and 13716 rows), we can see the 28 regions included in the data set.
The ‘FAO_region_country’ data set contains information from the United Nation’s Food and Agriculture Association. This data set includes more detailed information related to the regions referenced in the ‘birds’ data set, specifically, the countries that are in each region. After creating a subset of the original data set that includes only “country” and “country_group”, I now have a data set with 2 columns and 1943 rows of data.
Tidy Data (as needed)
After reviewing the values within the ‘FAO_region_country’ data set, I can see that there is a value for “World” in the “country_group” column. So, I will remove this, as its data will overlap with that of other distinct countries.
install.packages("kableExtra")
Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(kableExtra)
library(readxl)
# remove "World" group
<- FAO_region_country%>%
FAO_region_cln filter(country_group!="World")
# create kable:
%>%
FAO_region_clngroup_by(country_group)%>%
summarize(n=n())%>%
arrange(desc(n))
# A tibble: 57 × 2
country_group n
<chr> <int>
1 Non-Annex I countries 161
2 Net Food Importing Developing Countries 81
3 Annex I countries 78
4 High-income economies 64
5 Africa 63
6 Europe 63
7 Americas 61
8 Small Island Developing States 58
9 Upper-middle-income economies 56
10 Low Income Food Deficit Countries 55
# … with 47 more rows
<- c(1:round(nrow(FAO_region_cln)/2))
half ::kable(list(FAO_region_cln[half,],
knitrmatrix(numeric(), nrow=0, ncol=1),
-half,]),
FAO_region_cln[caption = "Countries in Country Groups")%>%
::kable_styling(font_size=12) kableExtra
|
|
|
There is still some overlap among country groups even after removing “World.” I need to extract country-level or regional groupings to see what information was aggregated in the original data set. Using Professor Rolfe’s example, I will identify four major grouping categories (annex, income, OECD, and major region) and the 277 countries nested within those categories.
%>%
FAO_region_clnsummarise(n=n())/277
n
1 6.01444
%>%
FAO_region_clnfilter(str_detect(country_group, "[aA]nnex"))%>%
group_by(country_group)%>%
summarise(n=n())
# A tibble: 2 × 2
country_group n
<chr> <int>
1 Annex I countries 78
2 Non-Annex I countries 161
%>%
FAO_region_clnfilter(str_detect(country_group, "[aA]nnex"))%>%
summarise(n=n())
# A tibble: 1 × 1
n
<int>
1 239
%>%
FAO_region_clnfilter(str_detect(country_group, "[iI]ncome"))%>%
group_by(country_group)%>%
summarise(n=n())
# A tibble: 5 × 2
country_group n
<chr> <int>
1 High-income economies 64
2 Low income economies 34
3 Low Income Food Deficit Countries 55
4 Lower-middle-income economies 46
5 Upper-middle-income economies 56
%>%
FAO_region_clnfilter(str_detect(country_group, "[iI]ncome"))%>%
summarise(n=n())
# A tibble: 1 × 1
n
<int>
1 255
%>%
FAO_region_clnfilter(str_detect(country_group, "[Dd]evelop|OECD"))%>%
group_by(country_group)%>%
summarise(n=n())
# A tibble: 5 × 2
country_group n
<chr> <int>
1 Land Locked Developing Countries 32
2 Least Developed Countries 51
3 Net Food Importing Developing Countries 81
4 OECD 36
5 Small Island Developing States 58
%>%
FAO_region_clnfilter(str_detect(country_group, "[Dd]evelop|OECD"))%>%
summarise(n=n())
# A tibble: 1 × 1
n
<int>
1 258
# group by major regions:
<- c("Africa", "Asia", "Europe", "Americas",
major_regions "Oceania", "Antarctic Region")
%>%
FAO_region_clnfilter(country_group %in% major_regions)%>%
summarise(n=n())
# A tibble: 1 × 1
n
<int>
1 277
I will now pivot the data and create four new categorical variables corresponding to the four groupings (annex, income, OECD, and major region) that contain most or all of the 277 countries included in the ’FAO_region_cln” data set.
# pivot and unite
<- FAO_region_cln%>%
FAO_regions_wide pivot_wider(names_from=country_group, values_from = 1)%>%
unite("gp_annex", contains("Annex"),
sep="", na.rm=TRUE, remove=TRUE)%>%
unite("gp_major_region", any_of(major_regions),
sep="", na.rm=TRUE, remove=TRUE)%>%
unite("gp_income", contains("Income")|contains("income"),
sep="", na.rm=TRUE, remove=TRUE)%>%
unite("gp_develop", contains("Develop")|contains("OECD"),
sep="", na.rm=TRUE, remove=TRUE)%>%
select(Country, starts_with("gp"))
Join Data
I will now join the ‘birds_split’ data set (“Area”) with the pivoted ‘FAO_regions_wide’ data set (country and regional variables). I will do a left join, as each case includes a country, and each country should match the four regional indicators. I will also set the two key fields as the “Area” variable in the ‘birds_split’ data set and the “country” variable in the ‘FAO_regions_wide’ data set.
I can see based on the number of observations that the joined data set includes the correct number of observations (13716).
nrow(birds_split)
[1] 13716
<- left_join(birds_split, FAO_regions_wide,
birds_join by = c("Area" = "Country"))